## All my imports
# Print library versions up front so the notebook records its environment.
print("Versions")
# Data science
import pandas as pd
print(f"Pandas: {pd.__version__}")
import numpy as np
print(f"Numpy: {np.__version__}")
# Visualization
import seaborn as sns
import matplotlib.pyplot as plt
sns.set(style="ticks", color_codes=True)
import os
# Text Stuff
from sklearn.feature_extraction.text import CountVectorizer
import nltk
from nltk.corpus import stopwords
import sys
# Make the parent directory importable so the local `research` module resolves.
sys.path.insert(1, '../')
import research
import re
# ML
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
plt.style.use("ggplot")
# Creating an object where my NLP EDA functions are located
nlp = research.nlp_eda()
Versions Pandas: 1.2.1 Numpy: 1.20.2
# Named hex palettes used for the notebook's plots.
colors = {"coral": "#FC766AFF", "pacific coast":"#5B84B1FF" , "black": "#101820FF", "orange": "#F2AA4CFF"}
# Second palette: "red" is used for DS plots and "blue" for DA plots below.
colors2 = {"pacific coast":"#5B84B1", "coral":"#FC766A" , "red": "#DC5757", "blue": "#4547CA",
"teal": "#8AF3CC"}
What are the expectations from a skills perspective?
Sandra: Data literacy literature - University is trying to build a scale - if you can do this = level 1, level 2, etc.
Software Development Lifecycle and Research Data Science Life Cycle are different.
MS: Two Diff Types:
What are the skills people are looking for that differ between an analyst and a scientist?
!ls ../data
OOH heading_counts_clean.csv merged_headings_df.csv chromedriver headings models chromedriver2 lda_results scraping_results heading_counts.csv merged_df.csv
# CSV locations, relative to the notebook directory.
paths = {'all_data': '../data/merged_df.csv', "headings": "../data/merged_headings_df.csv"}
# ingest — the first CSV column holds the saved index for both files
frames = {name: pd.read_csv(path, index_col=0) for name, path in paths.items()}
all_data = frames['all_data']
headings_df = frames['headings']
def clean(seq):
    '''
    Preprocess one job-description string with the Random Forest in mind.

    Lowercases the text, tokenizes on non-word characters, and drops the
    domain stopwords (e.g. "analyst", "scientist") that would leak the
    target label into the bag-of-words features.

    Parameters
    ----------
    seq : str
        Raw job-description text.

    Returns
    -------
    str
        Space-joined cleaned tokens.
    '''
    # converting to lowercase
    seq = seq.lower()
    # Label-leaking tokens plus noise ('\n' remnants, the ubiquitous "experience").
    to_remove = {'\n', 'analyst', 'scientist', 'science', 'machine', 'learning', 'scientists', "experience"}
    # re.split(r"\W+") yields "" at string edges; filter all empties here.
    # (The old `resultwords.remove("")` raised ValueError when no empty token
    # existed and removed only one of a possible two.)
    resultwords = [word for word in re.split(r"\W+", seq) if word and word not in to_remove]
    # converting back to string
    return " ".join(resultwords)
# Clean every description in place.
all_data['Description'] = all_data['Description'].map(clean)
# Per-role subsets of the postings.
ds = all_data.loc[all_data['Job Title'] == "data scientist"]
da = all_data.loc[all_data['Job Title'] == "data analyst"]
# Total count per token for each subset (word-count dataframe summed by column).
ds_words = nlp.to_wcdf(ds['Description']).sum()
da_words = nlp.to_wcdf(da['Description']).sum()
all_data
| Description | Location | Job Title | |
|---|---|---|---|
| 0 | this position can be based remotely anywhere i... | new york | data scientist |
| 1 | summary imagine what you could do here at appl... | texas | data scientist |
| 2 | lead analytics and measurement efforts for str... | other | data scientist |
| 3 | about gusto gusto is a modern online people pl... | other | data scientist |
| 4 | summary at apple new ideas have a way of becom... | texas | data scientist |
| ... | ... | ... | ... |
| 355 | the new york post is searching for a web to tu... | new york | other |
| 356 | the metro group inc is an established successf... | new york | business anlayst |
| 357 | it s fun to work in a company where people tru... | new york | data analyst |
| 358 | summary the business intelligence bi solutions... | new york | other |
| 359 | location new york new york global about indigo... | new york | other |
1703 rows × 3 columns
# unit testing
# Manual spot-check of the clean() logic on a single description.
example = all_data['Description'].iloc[0]
to_remove = {'\n','analyst','scientist', 'science'}
resultwords = [word for word in re.split("\W+",example) if word.lower() not in to_remove]
# NOTE(review): .remove("") assumes the split produced at least one empty
# token (it does here because the text ends in whitespace); it raises
# ValueError otherwise.
resultwords.remove("")
" ".join(resultwords)[:100]
'this position can be based remotely anywhere in the usa or based in tonawanda ny linde is a leading '
all_data.head()
| Description | Location | Job Title | |
|---|---|---|---|
| 0 | this position can be based remotely anywhere i... | new york | data scientist |
| 1 | summary imagine what you could do here at appl... | texas | data scientist |
| 2 | lead analytics and measurement efforts for str... | other | data scientist |
| 3 | about gusto gusto is a modern online people pl... | other | data scientist |
| 4 | summary at apple new ideas have a way of becom... | texas | data scientist |
def visualize_seq_lengths(seq, title = "Distribution of Word Lengths", color = None):
    '''
    Plot the distribution of whitespace-token counts per document.

    Parameters:
    -----------
    seq: A Pandas Series or simply a list
        containing the text documents to split
    title: str
        Figure title.
    color: optional matplotlib color for the histogram.
    '''
    # Wrap in a Series so plain lists work too, as the docstring promises
    # (.apply only exists on pandas objects).  Removed a leftover debug print.
    tokenized = pd.Series(seq).apply(lambda x: x.split())
    seq_lengths = [len(line) for line in tokenized]
    plt.figure(figsize = (16, 5))
    # histplot(stat="density", kde=True) is the supported replacement for the
    # deprecated distplot(..., norm_hist=True) (see the FutureWarning emitted
    # by the original cell).
    sns.histplot(seq_lengths, color = color, stat = "density", kde = True)
    plt.xlabel("Sequence Length")
    plt.title(title)
    plt.show()
# Overall sequence-length distribution across every posting.
visualize_seq_lengths(all_data['Description'], "Distribution of Sequence Lengths", color = colors2['teal'])
0 [this, position, can, be, based, remotely, any...
1 [summary, imagine, what, you, could, do, here,...
2 [lead, analytics, and, measurement, efforts, f...
3 [about, gusto, gusto, is, a, modern, online, p...
4 [summary, at, apple, new, ideas, have, a, way,...
...
355 [the, new, york, post, is, searching, for, a, ...
356 [the, metro, group, inc, is, an, established, ...
357 [it, s, fun, to, work, in, a, company, where, ...
358 [summary, the, business, intelligence, bi, sol...
359 [location, new, york, new, york, global, about...
Name: Description, Length: 1703, dtype: object
/Users/mtaruno/opt/anaconda3/envs/cannon/lib/python3.8/site-packages/seaborn/distributions.py:2551: FutureWarning: `distplot` is a deprecated function and will be removed in a future version. Please adapt your code to use either `displot` (a figure-level function with similar flexibility) or `histplot` (an axes-level function for histograms).
# looking at how many headings there are
# (1636 distinct titles; "responsibilities"/"qualifications" dominate, with a
# long tail of one-off headings)
headings_df['Heading Title'].value_counts()
responsibilities 363
qualifications 358
job description 211
location 190
preferred qualifications 166
...
excellent analytical skills 1
ten years related work experience 1
internal jobcode 70127 1
experience with gcp 1
why ft partners 1
Name: Heading Title, Length: 1636, dtype: int64
# We can look at this separately for job description and job requirements
heading_name = 'job description'
# Build the boolean mask explicitly, then select the matching rows.
mask = headings_df['Heading Title'] == heading_name
headings_df[mask]
| ID | Heading Text | Heading Title | Frequency | Person/Job/Org/None | |
|---|---|---|---|---|---|
| 1217 | 15 | At Rockwell Automation, we offer transformativ... | job description | 189 | Job |
| 1218 | 26 | Resideo is reinventing how we look at the home... | job description | 189 | Job |
| 1219 | 31 | We’re hiring a Growth Analytics Data Scientist... | job description | 189 | Job |
| 1220 | 40 | Square provides tools and services that enable... | job description | 189 | Job |
| 1221 | 61 | NaN | job description | 189 | Job |
| ... | ... | ... | ... | ... | ... |
| 1423 | 1680 | NaN | job description | 189 | Job |
| 1424 | 1682 | The Mayor's Office of Contract Services (MOCS)... | job description | 189 | Job |
| 1425 | 1697 | The New York City Department of Health and Men... | job description | 189 | Job |
| 1426 | 1700 | NaN | job description | 189 | Job |
| 1427 | 1702 | Our clients are making investments in smart gr... | job description | 189 | Job |
211 rows × 5 columns
# Distribution of the entity label column — presumably a manual annotation of
# whether each heading refers to the Person, Job, or Org ("?" = unresolved);
# verify against the labeling step.
headings_df['Person/Job/Org/None'].value_counts()
Job 2383 Person 2293 Org 893 None 660 ? 52 Name: Person/Job/Org/None, dtype: int64
# Sequence-length distributions per role, via the research module's helper.
nlp.visualize_seq_lengths(ds['Description'], "Seq Length Dist for DS", color = colors2['red'])
nlp.visualize_seq_lengths(da['Description'], "Seq Length Dist for DA", color = colors2['blue'])
/Users/mtaruno/opt/anaconda3/envs/cannon/lib/python3.8/site-packages/seaborn/distributions.py:2551: FutureWarning: `distplot` is a deprecated function and will be removed in a future version. Please adapt your code to use either `displot` (a figure-level function with similar flexibility) or `histplot` (an axes-level function for histograms). warnings.warn(msg, FutureWarning)
/Users/mtaruno/opt/anaconda3/envs/cannon/lib/python3.8/site-packages/seaborn/distributions.py:2551: FutureWarning: `distplot` is a deprecated function and will be removed in a future version. Please adapt your code to use either `displot` (a figure-level function with similar flexibility) or `histplot` (an axes-level function for histograms). warnings.warn(msg, FutureWarning)
From just the word lengths alone, one might be able to infer that data science has more requirements.
I can do a Bayesian analysis to get the posterior distribution and find the probability that the mean sequence length for data science jobs is greater than the mean sequence length for data analyst jobs.
# import pymc3 as pm
# with pm.Model() as model:
# group1_mean = pm.Normal('group1_mean', mean_prior_mean, sd=mean_prior_std)
# group2_mean = pm.Normal('group2_mean', mean_prior_mean, sd=mean_prior_std)
# Peek at the headings table (ID, heading text, title, frequency, entity label).
headings_df.head()
| ID | Heading Text | Heading Title | Frequency | Person/Job/Org/None | |
|---|---|---|---|---|---|
| 0 | 0 | This is a unique opportunity to collaborate wi... | position summary | 56 | Job |
| 1 | 43 | We are looking for a full time Data Scientist ... | position summary | 56 | Job |
| 2 | 157 | The Senior Data Analyst is responsible for ana... | position summary | 56 | Job |
| 3 | 176 | In this role you'll be working on a team desig... | position summary | 56 | Job |
| 4 | 183 | The Senior Data Analyst is responsible for ana... | position summary | 56 | Job |
# Null values check
# Keep only rows that actually have heading text, then re-count the nulls.
non_null_headings = headings_df.dropna(subset=['Heading Text'])
non_null_headings.isna().sum()
ID 0 Heading Text 0 Heading Title 0 Frequency 0 Person/Job/Org/None 774 dtype: int64
# Confirm the null filter by eyeballing the first rows.
non_null_headings.head()
| ID | Heading Text | Heading Title | Frequency | Person/Job/Org/None | |
|---|---|---|---|---|---|
| 0 | 0 | This is a unique opportunity to collaborate wi... | position summary | 56 | Job |
| 1 | 43 | We are looking for a full time Data Scientist ... | position summary | 56 | Job |
| 2 | 157 | The Senior Data Analyst is responsible for ana... | position summary | 56 | Job |
| 3 | 176 | In this role you'll be working on a team desig... | position summary | 56 | Job |
| 4 | 183 | The Senior Data Analyst is responsible for ana... | position summary | 56 | Job |
# Row/column count after dropping rows with missing heading text.
non_null_headings.shape
(6373, 5)
# Top 30 tokens across all heading text, plotted with the shared bar() helper.
totals = nlp.to_wcdf(non_null_headings['Heading Text']).sum()
heading_text_counts = totals.sort_values(ascending=False).iloc[:30]
bar(x=heading_text_counts.index, y=heading_text_counts.values,
    title="Visualizing Top 30 Most Frequent Words in Corpus")
heading_text_counts
experience 1488 business 1359 science 997 team 958 work 896 degree 782 analytics 715 years 696 analysis 544 computer 544 learning 540 related 528 product 497 new 476 engineering 475 field 458 research 452 management 436 analyst 436 health 431 technology 419 machine 418 support 398 solutions 396 skills 395 people 393 world 390 statistics 388 across 377 including 371 dtype: int64
# Looking at the distribution of words
# (research-module helper; plots token counts for the heading snippets)
nlp.visualize_counts(non_null_headings['Heading Text'])
/Users/mtaruno/opt/anaconda3/envs/cannon/lib/python3.8/site-packages/seaborn/_decorators.py:36: FutureWarning: Pass the following variables as keyword args: x, y. From version 0.12, the only valid positional argument will be `data`, and passing other arguments without an explicit keyword will result in an error or misinterpretation. warnings.warn(
# Sequence-length distribution for the heading snippets.
nlp.visualize_seq_lengths(non_null_headings['Heading Text'])
/Users/mtaruno/opt/anaconda3/envs/cannon/lib/python3.8/site-packages/seaborn/distributions.py:2551: FutureWarning: `distplot` is a deprecated function and will be removed in a future version. Please adapt your code to use either `displot` (a figure-level function with similar flexibility) or `histplot` (an axes-level function for histograms). warnings.warn(msg, FutureWarning)
# Load the shared bar() plotting helper into the notebook namespace.
%run ../bar.py
Great! Enabled axis to bar function
# Top-50 vocabulary for each role, as sets so we can compare them.
top_ds = set(ds_words.sort_values(ascending=False).head(50).index)
top_da = set(da_words.sort_values(ascending=False).head(50).index)
# these are words that appear in both the DS and DA
intersection = top_ds & top_da
intersection
{'ability',
'across',
'analysis',
'analytics',
'business',
'company',
'degree',
'develop',
'help',
'including',
'information',
'insights',
'knowledge',
'new',
'opportunity',
'people',
'product',
'research',
'role',
'skills',
'solutions',
'sql',
'status',
'strong',
'support',
'team',
'teams',
'technical',
'tools',
'us',
'using',
'work',
'working',
'years'}
# words that are in ds but not in da
# (set difference of the two top-50 vocabularies)
ds_not_da = top_ds - top_da
ds_not_da
{'build',
'building',
'design',
'development',
'engineering',
'gender',
'modeling',
'models',
'problems',
'products',
'python',
'statistical',
'statistics',
'techniques',
'use',
'world'}
# words that are in da but not in ds
# (set difference of the two top-50 vocabularies)
da_not_ds = top_da - top_ds
da_not_ds
{'analytical',
'environment',
'health',
'job',
'management',
'position',
'projects',
'provide',
'quality',
'related',
'reporting',
'reports',
'required',
'requirements',
'systems',
'time'}
import plotly.express as px
# creating stacked bar chart of top intersection terms
ds_wc = nlp.to_wcdf(ds['Description']).sum()
da_wc = nlp.to_wcdf(da['Description']).sum()
# Keep only tokens that appear in both top-50 vocabularies, then tag each
# frame with its role so plotly can color by it.
ds_wc_final = ds_wc[ds_wc.index.isin(intersection)].reset_index()
ds_wc_final["title"] = "DS"
da_wc_final = da_wc[da_wc.index.isin(intersection)].reset_index()
da_wc_final["title"] = "DA"
combined = pd.concat([ds_wc_final, da_wc_final], axis=0)
combined = combined.rename(columns={"index": "word", 0: "counts"}).sort_values("counts", ascending=False)
px.bar(combined, x = "word", y = "counts", color = "title", title = "Comparing Tokens That Appear in Both DS and DA",
       barmode = "group")
# creating a word cloud for top ds words
ds_word_counts = ds_words.reset_index().rename(columns = {"index": "token", 0:"counts"})
# creating a word cloud for top ds words
da_word_counts = da_words.reset_index().rename(columns = {"index": "token", 0:"counts"})
# Restrict each frame to its role-exclusive vocabulary, most frequent first.
top_ds_words = ds_word_counts[ds_word_counts['token'].isin(ds_not_da)].sort_values("counts", ascending=False)
top_da_words = da_word_counts[da_word_counts['token'].isin(da_not_ds)].sort_values("counts", ascending=False)
import random
# Expand each token to `count` repetitions so downstream tools see raw text.
da_text = [tok for tok, cnt in zip(top_da_words['token'], top_da_words['counts'])
           for _ in range(cnt)]
ds_text = [tok for tok, cnt in zip(top_ds_words['token'], top_ds_words['counts'])
           for _ in range(cnt)]
both_text = []  # kept for parity with the original cell (currently unused)
# Needs shuffling in order to work
random.shuffle(da_text)
random.shuffle(ds_text)
# plot counts
da_counts = nlp.to_wcdf(da_text).sum().sort_values(ascending=False)
bar(da_counts.index, da_counts.values, title = "DA not in DS", color = colors2["blue"])
ds_counts = nlp.to_wcdf(ds_text).sum().sort_values(ascending=False)
bar(ds_counts.index, ds_counts.values, title = "DS not in DA", color = colors2["red"])
# convert list to string format
da_text = " ".join(da_text)
ds_text = " ".join(ds_text)
from wordcloud import WordCloud
def plotcloud(wordcloud):
    '''Display a generated WordCloud image at 8x8 inches with the axes hidden.'''
    fig, ax = plt.subplots(figsize=(8, 8))
    ax.imshow(wordcloud, interpolation="bilinear")
    ax.axis("off")
    plt.show()
# Word clouds of the role-exclusive vocabularies: vertical-only layout
# (prefer_horizontal = 0) and "experience" excluded as a stopword.
# NOTE(review): these names shadow the earlier ds_wc/da_wc count Series.
da_wc = WordCloud(max_font_size=50, max_words=100, background_color=colors2["blue"],
prefer_horizontal = 0,
stopwords = ['experience']).generate(da_text)
ds_wc = WordCloud(max_font_size=50, max_words=100, background_color=colors2['red'],
prefer_horizontal = 0,
stopwords = ['experience']).generate(ds_text)
print("DA Cloud \n")
plotcloud(da_wc)
print("DS Cloud \n")
plotcloud(ds_wc)
DA Cloud
DS Cloud
Ideas for analysis:
Spit out a CSV file with unique header examples. Then we can make dummy variables to see what's in the skill requirements vs what does the job do. It would be interesting to see when we start looking at different job titles, what words pop up in common across them? What's in a data scientist job description that is not in a job analyst job description?
What are words that appear in one vs the other? What is the range of job descriptions that we are pulling out?
Data Analyst postings have fewer requirements than Data Scientist postings — is that actually true? We can look at it by industry and location as well.
Silicon Valley vs New York: are there regional differences in expectations for technical capabilities?
Let's try a cool exercise and try to predict whether or not a job posting will be a data science job posting or a data analyst job posting.
%run ../bar.py
Great! Enabled axis to bar function
# Class balance of the scraped postings by job title.
title = all_data['Job Title']
counts = title.value_counts()
x = counts.index
y = counts.values
bar(x, y, title = "Job Title Distribution")
import datetime
# Let's stick to data science vs data analyst
data = all_data[all_data['Job Title'].isin(["data scientist", "data analyst"])]
# Getting X feature vector: one word-count dataframe gives both the column
# names (vocabulary) and the numeric matrix.
word_count_df = nlp.to_wcdf(data['Description'])
feature_names = word_count_df.columns.tolist()
text_data = word_count_df.to_numpy()
from sklearn.model_selection import train_test_split
# Focusing on the inbound text
X = text_data
y = data['Job Title']
# Splitting, with stratify param for class balance
# (1/3 held out; random_state pinned for reproducibility)
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size = 0.33,
random_state = 25, stratify = y)
# Re-checking the heading-title distribution (repeat of an earlier cell).
headings_df['Heading Title'].value_counts()
responsibilities 363
qualifications 358
job description 211
location 190
preferred qualifications 166
...
excellent analytical skills 1
ten years related work experience 1
internal jobcode 70127 1
experience with gcp 1
why ft partners 1
Name: Heading Title, Length: 1636, dtype: int64
%time
# Initializing my pipeline
# Single-step pipeline so GridSearchCV can also swap the estimator itself.
estimators = [('model', RandomForestClassifier())]
pipe = Pipeline(estimators)
# These are the hyperparamaters and models I want to tune:
# forest sizes 50, 150, 250, 350, 450.
param_grid = [
    {'model': [RandomForestClassifier()],
     'model__n_estimators': list(range(50, 501, 100))},
]
# 5 fold cross validation
grid = GridSearchCV(pipe, param_grid, cv=5, verbose=3)
fitted_grid = grid.fit(X_train, y_train)
CPU times: user 3 µs, sys: 3 µs, total: 6 µs
Wall time: 12.2 µs
Fitting 5 folds for each of 5 candidates, totalling 25 fits
[CV] model=RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
criterion='gini', max_depth=None, max_features='auto',
max_leaf_nodes=None, max_samples=None,
min_impurity_decrease=0.0, min_impurity_split=None,
min_samples_leaf=1, min_samples_split=2,
min_weight_fraction_leaf=0.0, n_estimators=100,
n_jobs=None, oob_score=False, random_state=None,
verbose=0, warm_start=False), model__n_estimators=50
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[CV] model=RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
criterion='gini', max_depth=None, max_features='auto',
max_leaf_nodes=None, max_samples=None,
min_impurity_decrease=0.0, min_impurity_split=None,
min_samples_leaf=1, min_samples_split=2,
min_weight_fraction_leaf=0.0, n_estimators=100,
n_jobs=None, oob_score=False, random_state=None,
verbose=0, warm_start=False), model__n_estimators=50, score=0.930, total= 0.3s
[CV] model=RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
criterion='gini', max_depth=None, max_features='auto',
max_leaf_nodes=None, max_samples=None,
min_impurity_decrease=0.0, min_impurity_split=None,
min_samples_leaf=1, min_samples_split=2,
min_weight_fraction_leaf=0.0, n_estimators=100,
n_jobs=None, oob_score=False, random_state=None,
verbose=0, warm_start=False), model__n_estimators=50
[CV] model=RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
criterion='gini', max_depth=None, max_features='auto',
max_leaf_nodes=None, max_samples=None,
min_impurity_decrease=0.0, min_impurity_split=None,
min_samples_leaf=1, min_samples_split=2,
min_weight_fraction_leaf=0.0, n_estimators=100,
n_jobs=None, oob_score=False, random_state=None,
verbose=0, warm_start=False), model__n_estimators=50, score=0.904, total= 0.1s
[CV] model=RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
criterion='gini', max_depth=None, max_features='auto',
max_leaf_nodes=None, max_samples=None,
min_impurity_decrease=0.0, min_impurity_split=None,
min_samples_leaf=1, min_samples_split=2,
min_weight_fraction_leaf=0.0, n_estimators=100,
n_jobs=None, oob_score=False, random_state=None,
verbose=0, warm_start=False), model__n_estimators=50
[Parallel(n_jobs=1)]: Done 1 out of 1 | elapsed: 0.3s remaining: 0.0s [Parallel(n_jobs=1)]: Done 2 out of 2 | elapsed: 0.4s remaining: 0.0s
[CV] model=RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
criterion='gini', max_depth=None, max_features='auto',
max_leaf_nodes=None, max_samples=None,
min_impurity_decrease=0.0, min_impurity_split=None,
min_samples_leaf=1, min_samples_split=2,
min_weight_fraction_leaf=0.0, n_estimators=100,
n_jobs=None, oob_score=False, random_state=None,
verbose=0, warm_start=False), model__n_estimators=50, score=0.852, total= 0.1s
[CV] model=RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
criterion='gini', max_depth=None, max_features='auto',
max_leaf_nodes=None, max_samples=None,
min_impurity_decrease=0.0, min_impurity_split=None,
min_samples_leaf=1, min_samples_split=2,
min_weight_fraction_leaf=0.0, n_estimators=100,
n_jobs=None, oob_score=False, random_state=None,
verbose=0, warm_start=False), model__n_estimators=50
[CV] model=RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
criterion='gini', max_depth=None, max_features='auto',
max_leaf_nodes=None, max_samples=None,
min_impurity_decrease=0.0, min_impurity_split=None,
min_samples_leaf=1, min_samples_split=2,
min_weight_fraction_leaf=0.0, n_estimators=100,
n_jobs=None, oob_score=False, random_state=None,
verbose=0, warm_start=False), model__n_estimators=50, score=0.930, total= 0.1s
[CV] model=RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
criterion='gini', max_depth=None, max_features='auto',
max_leaf_nodes=None, max_samples=None,
min_impurity_decrease=0.0, min_impurity_split=None,
min_samples_leaf=1, min_samples_split=2,
min_weight_fraction_leaf=0.0, n_estimators=100,
n_jobs=None, oob_score=False, random_state=None,
verbose=0, warm_start=False), model__n_estimators=50
[CV] model=RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
criterion='gini', max_depth=None, max_features='auto',
max_leaf_nodes=None, max_samples=None,
min_impurity_decrease=0.0, min_impurity_split=None,
min_samples_leaf=1, min_samples_split=2,
min_weight_fraction_leaf=0.0, n_estimators=100,
n_jobs=None, oob_score=False, random_state=None,
verbose=0, warm_start=False), model__n_estimators=50, score=0.807, total= 0.1s
[CV] model=RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
criterion='gini', max_depth=None, max_features='auto',
max_leaf_nodes=None, max_samples=None,
min_impurity_decrease=0.0, min_impurity_split=None,
min_samples_leaf=1, min_samples_split=2,
min_weight_fraction_leaf=0.0, n_estimators=100,
n_jobs=None, oob_score=False, random_state=None,
verbose=0, warm_start=False), model__n_estimators=150
[CV] model=RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
criterion='gini', max_depth=None, max_features='auto',
max_leaf_nodes=None, max_samples=None,
min_impurity_decrease=0.0, min_impurity_split=None,
min_samples_leaf=1, min_samples_split=2,
min_weight_fraction_leaf=0.0, n_estimators=100,
n_jobs=None, oob_score=False, random_state=None,
verbose=0, warm_start=False), model__n_estimators=150, score=0.939, total= 0.6s
[CV] model=RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
criterion='gini', max_depth=None, max_features='auto',
max_leaf_nodes=None, max_samples=None,
min_impurity_decrease=0.0, min_impurity_split=None,
min_samples_leaf=1, min_samples_split=2,
min_weight_fraction_leaf=0.0, n_estimators=100,
n_jobs=None, oob_score=False, random_state=None,
verbose=0, warm_start=False), model__n_estimators=150
[CV] model=RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
criterion='gini', max_depth=None, max_features='auto',
max_leaf_nodes=None, max_samples=None,
min_impurity_decrease=0.0, min_impurity_split=None,
min_samples_leaf=1, min_samples_split=2,
min_weight_fraction_leaf=0.0, n_estimators=100,
n_jobs=None, oob_score=False, random_state=None,
verbose=0, warm_start=False), model__n_estimators=150, score=0.878, total= 0.6s
[CV] model=RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
criterion='gini', max_depth=None, max_features='auto',
max_leaf_nodes=None, max_samples=None,
min_impurity_decrease=0.0, min_impurity_split=None,
min_samples_leaf=1, min_samples_split=2,
min_weight_fraction_leaf=0.0, n_estimators=100,
n_jobs=None, oob_score=False, random_state=None,
verbose=0, warm_start=False), model__n_estimators=150
[CV] model=RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
criterion='gini', max_depth=None, max_features='auto',
max_leaf_nodes=None, max_samples=None,
min_impurity_decrease=0.0, min_impurity_split=None,
min_samples_leaf=1, min_samples_split=2,
min_weight_fraction_leaf=0.0, n_estimators=100,
n_jobs=None, oob_score=False, random_state=None,
verbose=0, warm_start=False), model__n_estimators=150, score=0.878, total= 0.5s
[CV] model=RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
criterion='gini', max_depth=None, max_features='auto',
max_leaf_nodes=None, max_samples=None,
min_impurity_decrease=0.0, min_impurity_split=None,
min_samples_leaf=1, min_samples_split=2,
min_weight_fraction_leaf=0.0, n_estimators=100,
n_jobs=None, oob_score=False, random_state=None,
verbose=0, warm_start=False), model__n_estimators=150
[CV] model=RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
criterion='gini', max_depth=None, max_features='auto',
max_leaf_nodes=None, max_samples=None,
min_impurity_decrease=0.0, min_impurity_split=None,
min_samples_leaf=1, min_samples_split=2,
min_weight_fraction_leaf=0.0, n_estimators=100,
n_jobs=None, oob_score=False, random_state=None,
verbose=0, warm_start=False), model__n_estimators=150, score=0.922, total= 0.4s
[CV] model=RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
criterion='gini', max_depth=None, max_features='auto',
max_leaf_nodes=None, max_samples=None,
min_impurity_decrease=0.0, min_impurity_split=None,
min_samples_leaf=1, min_samples_split=2,
min_weight_fraction_leaf=0.0, n_estimators=100,
n_jobs=None, oob_score=False, random_state=None,
verbose=0, warm_start=False), model__n_estimators=150
[CV] model=RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
criterion='gini', max_depth=None, max_features='auto',
max_leaf_nodes=None, max_samples=None,
min_impurity_decrease=0.0, min_impurity_split=None,
min_samples_leaf=1, min_samples_split=2,
min_weight_fraction_leaf=0.0, n_estimators=100,
n_jobs=None, oob_score=False, random_state=None,
verbose=0, warm_start=False), model__n_estimators=150, score=0.851, total= 0.5s
[CV] model=RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
criterion='gini', max_depth=None, max_features='auto',
max_leaf_nodes=None, max_samples=None,
min_impurity_decrease=0.0, min_impurity_split=None,
min_samples_leaf=1, min_samples_split=2,
min_weight_fraction_leaf=0.0, n_estimators=100,
n_jobs=None, oob_score=False, random_state=None,
verbose=0, warm_start=False), model__n_estimators=250
[CV] model=RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
criterion='gini', max_depth=None, max_features='auto',
max_leaf_nodes=None, max_samples=None,
min_impurity_decrease=0.0, min_impurity_split=None,
min_samples_leaf=1, min_samples_split=2,
min_weight_fraction_leaf=0.0, n_estimators=100,
n_jobs=None, oob_score=False, random_state=None,
verbose=0, warm_start=False), model__n_estimators=250, score=0.957, total= 1.0s
[CV] model=RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
criterion='gini', max_depth=None, max_features='auto',
max_leaf_nodes=None, max_samples=None,
min_impurity_decrease=0.0, min_impurity_split=None,
min_samples_leaf=1, min_samples_split=2,
min_weight_fraction_leaf=0.0, n_estimators=100,
n_jobs=None, oob_score=False, random_state=None,
verbose=0, warm_start=False), model__n_estimators=250
[CV] model=RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
criterion='gini', max_depth=None, max_features='auto',
max_leaf_nodes=None, max_samples=None,
min_impurity_decrease=0.0, min_impurity_split=None,
min_samples_leaf=1, min_samples_split=2,
min_weight_fraction_leaf=0.0, n_estimators=100,
n_jobs=None, oob_score=False, random_state=None,
verbose=0, warm_start=False), model__n_estimators=250, score=0.904, total= 0.8s
[CV] model=RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
criterion='gini', max_depth=None, max_features='auto',
max_leaf_nodes=None, max_samples=None,
min_impurity_decrease=0.0, min_impurity_split=None,
min_samples_leaf=1, min_samples_split=2,
min_weight_fraction_leaf=0.0, n_estimators=100,
n_jobs=None, oob_score=False, random_state=None,
verbose=0, warm_start=False), model__n_estimators=250
[CV] model=RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
criterion='gini', max_depth=None, max_features='auto',
max_leaf_nodes=None, max_samples=None,
min_impurity_decrease=0.0, min_impurity_split=None,
min_samples_leaf=1, min_samples_split=2,
min_weight_fraction_leaf=0.0, n_estimators=100,
n_jobs=None, oob_score=False, random_state=None,
verbose=0, warm_start=False), model__n_estimators=250, score=0.861, total= 0.6s
[CV] model=RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
criterion='gini', max_depth=None, max_features='auto',
max_leaf_nodes=None, max_samples=None,
min_impurity_decrease=0.0, min_impurity_split=None,
min_samples_leaf=1, min_samples_split=2,
min_weight_fraction_leaf=0.0, n_estimators=100,
n_jobs=None, oob_score=False, random_state=None,
verbose=0, warm_start=False), model__n_estimators=250
[CV] model=RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
criterion='gini', max_depth=None, max_features='auto',
max_leaf_nodes=None, max_samples=None,
min_impurity_decrease=0.0, min_impurity_split=None,
min_samples_leaf=1, min_samples_split=2,
min_weight_fraction_leaf=0.0, n_estimators=100,
n_jobs=None, oob_score=False, random_state=None,
verbose=0, warm_start=False), model__n_estimators=250, score=0.896, total= 0.6s
[CV] model=RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
criterion='gini', max_depth=None, max_features='auto',
max_leaf_nodes=None, max_samples=None,
min_impurity_decrease=0.0, min_impurity_split=None,
min_samples_leaf=1, min_samples_split=2,
min_weight_fraction_leaf=0.0, n_estimators=100,
n_jobs=None, oob_score=False, random_state=None,
verbose=0, warm_start=False), model__n_estimators=250
[CV] model=RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
criterion='gini', max_depth=None, max_features='auto',
max_leaf_nodes=None, max_samples=None,
min_impurity_decrease=0.0, min_impurity_split=None,
min_samples_leaf=1, min_samples_split=2,
min_weight_fraction_leaf=0.0, n_estimators=100,
n_jobs=None, oob_score=False, random_state=None,
verbose=0, warm_start=False), model__n_estimators=250, score=0.868, total= 0.8s
[CV] model=RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
criterion='gini', max_depth=None, max_features='auto',
max_leaf_nodes=None, max_samples=None,
min_impurity_decrease=0.0, min_impurity_split=None,
min_samples_leaf=1, min_samples_split=2,
min_weight_fraction_leaf=0.0, n_estimators=100,
n_jobs=None, oob_score=False, random_state=None,
verbose=0, warm_start=False), model__n_estimators=350
[CV] model=RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
criterion='gini', max_depth=None, max_features='auto',
max_leaf_nodes=None, max_samples=None,
min_impurity_decrease=0.0, min_impurity_split=None,
min_samples_leaf=1, min_samples_split=2,
min_weight_fraction_leaf=0.0, n_estimators=100,
n_jobs=None, oob_score=False, random_state=None,
verbose=0, warm_start=False), model__n_estimators=350, score=0.957, total= 0.9s
[CV] model=RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
criterion='gini', max_depth=None, max_features='auto',
max_leaf_nodes=None, max_samples=None,
min_impurity_decrease=0.0, min_impurity_split=None,
min_samples_leaf=1, min_samples_split=2,
min_weight_fraction_leaf=0.0, n_estimators=100,
n_jobs=None, oob_score=False, random_state=None,
verbose=0, warm_start=False), model__n_estimators=350
[CV] model=RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
criterion='gini', max_depth=None, max_features='auto',
max_leaf_nodes=None, max_samples=None,
min_impurity_decrease=0.0, min_impurity_split=None,
min_samples_leaf=1, min_samples_split=2,
min_weight_fraction_leaf=0.0, n_estimators=100,
n_jobs=None, oob_score=False, random_state=None,
verbose=0, warm_start=False), model__n_estimators=350, score=0.896, total= 0.8s
[CV] model=RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
criterion='gini', max_depth=None, max_features='auto',
max_leaf_nodes=None, max_samples=None,
min_impurity_decrease=0.0, min_impurity_split=None,
min_samples_leaf=1, min_samples_split=2,
min_weight_fraction_leaf=0.0, n_estimators=100,
n_jobs=None, oob_score=False, random_state=None,
verbose=0, warm_start=False), model__n_estimators=350
[CV] model=RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
criterion='gini', max_depth=None, max_features='auto',
max_leaf_nodes=None, max_samples=None,
min_impurity_decrease=0.0, min_impurity_split=None,
min_samples_leaf=1, min_samples_split=2,
min_weight_fraction_leaf=0.0, n_estimators=100,
n_jobs=None, oob_score=False, random_state=None,
verbose=0, warm_start=False), model__n_estimators=350, score=0.852, total= 0.8s
[CV] model=RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
criterion='gini', max_depth=None, max_features='auto',
max_leaf_nodes=None, max_samples=None,
min_impurity_decrease=0.0, min_impurity_split=None,
min_samples_leaf=1, min_samples_split=2,
min_weight_fraction_leaf=0.0, n_estimators=100,
n_jobs=None, oob_score=False, random_state=None,
verbose=0, warm_start=False), model__n_estimators=350
[CV] model=RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
criterion='gini', max_depth=None, max_features='auto',
max_leaf_nodes=None, max_samples=None,
min_impurity_decrease=0.0, min_impurity_split=None,
min_samples_leaf=1, min_samples_split=2,
min_weight_fraction_leaf=0.0, n_estimators=100,
n_jobs=None, oob_score=False, random_state=None,
verbose=0, warm_start=False), model__n_estimators=350, score=0.904, total= 0.8s
[CV] model=RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
criterion='gini', max_depth=None, max_features='auto',
max_leaf_nodes=None, max_samples=None,
min_impurity_decrease=0.0, min_impurity_split=None,
min_samples_leaf=1, min_samples_split=2,
min_weight_fraction_leaf=0.0, n_estimators=100,
n_jobs=None, oob_score=False, random_state=None,
verbose=0, warm_start=False), model__n_estimators=350
[CV] model=RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
criterion='gini', max_depth=None, max_features='auto',
max_leaf_nodes=None, max_samples=None,
min_impurity_decrease=0.0, min_impurity_split=None,
min_samples_leaf=1, min_samples_split=2,
min_weight_fraction_leaf=0.0, n_estimators=100,
n_jobs=None, oob_score=False, random_state=None,
verbose=0, warm_start=False), model__n_estimators=350, score=0.868, total= 0.9s
[CV] model=RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
criterion='gini', max_depth=None, max_features='auto',
max_leaf_nodes=None, max_samples=None,
min_impurity_decrease=0.0, min_impurity_split=None,
min_samples_leaf=1, min_samples_split=2,
min_weight_fraction_leaf=0.0, n_estimators=100,
n_jobs=None, oob_score=False, random_state=None,
verbose=0, warm_start=False), model__n_estimators=450
[CV] model=RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
criterion='gini', max_depth=None, max_features='auto',
max_leaf_nodes=None, max_samples=None,
min_impurity_decrease=0.0, min_impurity_split=None,
min_samples_leaf=1, min_samples_split=2,
min_weight_fraction_leaf=0.0, n_estimators=100,
n_jobs=None, oob_score=False, random_state=None,
verbose=0, warm_start=False), model__n_estimators=450, score=0.922, total= 1.1s
[CV] model=RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
criterion='gini', max_depth=None, max_features='auto',
max_leaf_nodes=None, max_samples=None,
min_impurity_decrease=0.0, min_impurity_split=None,
min_samples_leaf=1, min_samples_split=2,
min_weight_fraction_leaf=0.0, n_estimators=100,
n_jobs=None, oob_score=False, random_state=None,
verbose=0, warm_start=False), model__n_estimators=450
[CV] model=RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
criterion='gini', max_depth=None, max_features='auto',
max_leaf_nodes=None, max_samples=None,
min_impurity_decrease=0.0, min_impurity_split=None,
min_samples_leaf=1, min_samples_split=2,
min_weight_fraction_leaf=0.0, n_estimators=100,
n_jobs=None, oob_score=False, random_state=None,
verbose=0, warm_start=False), model__n_estimators=450, score=0.904, total= 1.1s
[CV] model=RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
criterion='gini', max_depth=None, max_features='auto',
max_leaf_nodes=None, max_samples=None,
min_impurity_decrease=0.0, min_impurity_split=None,
min_samples_leaf=1, min_samples_split=2,
min_weight_fraction_leaf=0.0, n_estimators=100,
n_jobs=None, oob_score=False, random_state=None,
verbose=0, warm_start=False), model__n_estimators=450
[CV] model=RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
criterion='gini', max_depth=None, max_features='auto',
max_leaf_nodes=None, max_samples=None,
min_impurity_decrease=0.0, min_impurity_split=None,
min_samples_leaf=1, min_samples_split=2,
min_weight_fraction_leaf=0.0, n_estimators=100,
n_jobs=None, oob_score=False, random_state=None,
verbose=0, warm_start=False), model__n_estimators=450, score=0.870, total= 1.3s
[CV] model=RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
criterion='gini', max_depth=None, max_features='auto',
max_leaf_nodes=None, max_samples=None,
min_impurity_decrease=0.0, min_impurity_split=None,
min_samples_leaf=1, min_samples_split=2,
min_weight_fraction_leaf=0.0, n_estimators=100,
n_jobs=None, oob_score=False, random_state=None,
verbose=0, warm_start=False), model__n_estimators=450
[CV] model=RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
criterion='gini', max_depth=None, max_features='auto',
max_leaf_nodes=None, max_samples=None,
min_impurity_decrease=0.0, min_impurity_split=None,
min_samples_leaf=1, min_samples_split=2,
min_weight_fraction_leaf=0.0, n_estimators=100,
n_jobs=None, oob_score=False, random_state=None,
verbose=0, warm_start=False), model__n_estimators=450, score=0.904, total= 1.1s
[CV] model=RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
criterion='gini', max_depth=None, max_features='auto',
max_leaf_nodes=None, max_samples=None,
min_impurity_decrease=0.0, min_impurity_split=None,
min_samples_leaf=1, min_samples_split=2,
min_weight_fraction_leaf=0.0, n_estimators=100,
n_jobs=None, oob_score=False, random_state=None,
verbose=0, warm_start=False), model__n_estimators=450
[CV] model=RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
criterion='gini', max_depth=None, max_features='auto',
max_leaf_nodes=None, max_samples=None,
min_impurity_decrease=0.0, min_impurity_split=None,
min_samples_leaf=1, min_samples_split=2,
min_weight_fraction_leaf=0.0, n_estimators=100,
n_jobs=None, oob_score=False, random_state=None,
verbose=0, warm_start=False), model__n_estimators=450, score=0.860, total= 1.1s
[Parallel(n_jobs=1)]: Done 25 out of 25 | elapsed: 17.1s finished
# BUG FIX: `sklearn.externals.joblib` was deprecated in scikit-learn 0.21 and
# removed in 0.23 -- joblib must be imported as its own package now.
import joblib
import datetime  # was used below but never imported in this file

# Timestamp string to the minute, e.g. '2021-04-28 21:37'
# (strftime here produces exactly the same text as str(now)[:16] did).
# NOTE(review): ':' is not a legal filename character on Windows -- confirm
# this notebook only runs on POSIX systems.
now = datetime.datetime.now().strftime('%Y-%m-%d %H:%M')
# Saving it because it took some time to run
joblib.dump(fitted_grid, f'../data/models/{now}.pkl', compress=1)
['../data/models/2021-04-28 21:37.pkl']
# Every fit from the hyper-parameter search, ranked best-first by mean CV score.
results = (
    pd.DataFrame(fitted_grid.cv_results_)
    .sort_values(by='mean_test_score', ascending=False)
)
results
| mean_fit_time | std_fit_time | mean_score_time | std_score_time | param_model | param_model__n_estimators | params | split0_test_score | split1_test_score | split2_test_score | split3_test_score | split4_test_score | mean_test_score | std_test_score | rank_test_score | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 2 | 0.755047 | 0.141881 | 0.015445 | 0.003750 | RandomForestClassifier(bootstrap=True, ccp_alp... | 250 | {'model': RandomForestClassifier(bootstrap=Tru... | 0.956522 | 0.904348 | 0.860870 | 0.895652 | 0.868421 | 0.897162 | 0.033825 | 1 |
| 3 | 0.816151 | 0.036132 | 0.017999 | 0.000464 | RandomForestClassifier(bootstrap=True, ccp_alp... | 350 | {'model': RandomForestClassifier(bootstrap=Tru... | 0.956522 | 0.895652 | 0.852174 | 0.904348 | 0.868421 | 0.895423 | 0.035812 | 2 |
| 1 | 0.496576 | 0.089171 | 0.013503 | 0.003749 | RandomForestClassifier(bootstrap=True, ccp_alp... | 150 | {'model': RandomForestClassifier(bootstrap=Tru... | 0.939130 | 0.878261 | 0.878261 | 0.921739 | 0.850877 | 0.893654 | 0.032127 | 3 |
| 4 | 1.115917 | 0.082877 | 0.023660 | 0.000932 | RandomForestClassifier(bootstrap=True, ccp_alp... | 450 | {'model': RandomForestClassifier(bootstrap=Tru... | 0.921739 | 0.904348 | 0.869565 | 0.904348 | 0.859649 | 0.891930 | 0.023406 | 4 |
| 0 | 0.150759 | 0.055983 | 0.004534 | 0.001005 | RandomForestClassifier(bootstrap=True, ccp_alp... | 50 | {'model': RandomForestClassifier(bootstrap=Tru... | 0.930435 | 0.904348 | 0.852174 | 0.930435 | 0.807018 | 0.884882 | 0.048294 | 5 |
# Bar chart of mean CV accuracy per grid candidate.
# NOTE(review): `bar` is not defined in this file -- presumably a plotting
# helper from the local `research` module; confirm.
bar(x = [*range(len(results))], y = results['mean_test_score'],
title = 'Accuracies are Very Similar')
# Refit the best pipeline found by the search on the full training split.
# NOTE(review): earlier cells call the search object `fitted_grid`; `grid` is
# not defined in the visible code -- confirm it is the same fitted
# GridSearchCV instance.
rf = grid.best_estimator_ # getting best param rf
rf.fit(X_train, y_train) # fitting this to get some results
Pipeline(memory=None,
steps=[('model',
RandomForestClassifier(bootstrap=True, ccp_alpha=0.0,
class_weight=None, criterion='gini',
max_depth=None, max_features='auto',
max_leaf_nodes=None, max_samples=None,
min_impurity_decrease=0.0,
min_impurity_split=None,
min_samples_leaf=1, min_samples_split=2,
min_weight_fraction_leaf=0.0,
n_estimators=250, n_jobs=None,
oob_score=False, random_state=None,
verbose=0, warm_start=False))],
verbose=False)
# List the matplotlib style sheets available in this environment.
print(plt.style.available)
['Solarize_Light2', '_classic_test_patch', 'bmh', 'classic', 'dark_background', 'fast', 'fivethirtyeight', 'ggplot', 'grayscale', 'seaborn', 'seaborn-bright', 'seaborn-colorblind', 'seaborn-dark', 'seaborn-dark-palette', 'seaborn-darkgrid', 'seaborn-deep', 'seaborn-muted', 'seaborn-notebook', 'seaborn-paper', 'seaborn-pastel', 'seaborn-poster', 'seaborn-talk', 'seaborn-ticks', 'seaborn-white', 'seaborn-whitegrid', 'tableau-colorblind10']
# Some more evaluation metrics with F1
from sklearn.metrics import ConfusionMatrixDisplay
from sklearn.metrics import classification_report
from sklearn.metrics import f1_score
from sklearn.metrics import confusion_matrix
# Switch plot styling for the evaluation figures below.
plt.style.use('seaborn')
def evaluate(model):
    """Print a full evaluation of a fitted classifier.

    Reports train/test accuracy, draws the confusion matrix, and prints the
    classification report plus per-class F1 scores.

    Relies on the module-level globals X_train, X_test, y_train, y_test, y.
    """
    # Scoring the model
    print(f'Score on train: {model.score(X_train, y_train)}')
    print(f'Score on test: {model.score(X_test, y_test)}')
    y_pred = model.predict(X_test)
    print('\nConfusion Matrix:')
    cm = confusion_matrix(y_test, y_pred)
    # BUG FIX: the original created a bare plt.figure(figsize=(10, 8)) and then
    # called .plot() with no `ax`, so ConfusionMatrixDisplay drew on a new
    # default-size figure and the 10x8 one rendered empty (the
    # "<Figure size 720x576 with 0 Axes>" seen in the output). Passing the
    # axes explicitly puts the matrix on the intended figure.
    fig, ax = plt.subplots(figsize=(10, 8))
    ConfusionMatrixDisplay(cm, display_labels=y.unique()).plot(
        ax=ax, xticks_rotation='vertical', values_format='d')
    plt.show()
    print('\nClassification Report:')
    print(classification_report(y_test, y_pred))
    print('\nF1 Score:')
    print(f1_score(y_test, y_pred, average=None))
# Run the evaluation on the best random forest pipeline.
evaluate(rf)
Score on train: 1.0 Score on test: 0.9190140845070423 Confusion Matrix:
<Figure size 720x576 with 0 Axes>
Classification Report:
precision recall f1-score support
data analyst 0.89 0.95 0.92 133
data scientist 0.95 0.89 0.92 151
accuracy 0.92 284
macro avg 0.92 0.92 0.92 284
weighted avg 0.92 0.92 0.92 284
F1 Score:
[0.91636364 0.92150171]
# Pair every vocabulary token with its importance in the fitted forest
# (rf[0] is the RandomForestClassifier step of the pipeline), ranked
# most-influential first.
importances = pd.DataFrame({
    "Feature": feature_names,
    "Feature Importance": rf[0].feature_importances_,
}).sort_values("Feature Importance", ascending=False)
importances.head(30)
| Feature | Feature Importance | |
|---|---|---|
| 2643 | machine | 0.026397 |
| 2525 | learning | 0.020729 |
| 3684 | reporting | 0.019383 |
| 3868 | science | 0.018598 |
| 1650 | excel | 0.012896 |
| 3502 | python | 0.011716 |
| 2825 | models | 0.010913 |
| 3872 | scientists | 0.009661 |
| 3202 | phd | 0.008929 |
| 233 | algorithms | 0.007832 |
| 3685 | reports | 0.007651 |
| 2817 | ml | 0.006797 |
| 457 | bachelor | 0.006444 |
| 603 | build | 0.005169 |
| 3319 | predictive | 0.004793 |
| 4089 | spark | 0.004693 |
| 4167 | statistics | 0.004665 |
| 326 | applied | 0.004412 |
| 1141 | dashboards | 0.004071 |
| 2426 | java | 0.003970 |
| 4358 | tensorflow | 0.003716 |
| 1692 | experimentation | 0.003589 |
| 3850 | scala | 0.003544 |
| 3853 | scale | 0.003417 |
| 4345 | techniques | 0.003372 |
| 2666 | management | 0.003355 |
| 1543 | engineering | 0.003188 |
| 4125 | sql | 0.003067 |
| 1170 | deep | 0.002972 |
| 2785 | microsoft | 0.002959 |
# Bar chart of the 30 most important tokens for the analyst-vs-scientist split.
# NOTE(review): `bar` is not defined in this file -- presumably a plotting
# helper from the local `research` module; confirm.
bar(x = importances.head(30).Feature, y = importances.head(30)['Feature Importance'],
title = "Which Tokens Are Important In Distinguishing Data Scientist vs Data Analyst Roles?")
First run:
Feature Feature Importance 281 analyst 0.055493 3873 scientist 0.044333 2526 learning 0.029539 2644 machine 0.022369 3869 science 0.017041 3503 python 0.016804 3685 reporting 0.015142 1651 excel 0.013871 3874 scientists 0.012198 3203 phd 0.010251 2818 ml 0.010073 2826 models 0.009907 458 bachelor 0.008129 4091 spark 0.006223 233 algorithms 0.006188 3686 reports 0.004794 4169 statistics 0.004112 1171 deep 0.004038 1693 experimentation 0.003922 3512 quality 0.003756 3216 physics 0.003279 327 applied 0.003085 3851 scala 0.002976 1562 ensure 0.002878 3391 production 0.002812 3320 predictive 0.002712 4360 tensorflow 0.002711 2427 java 0.002644 1142 dashboards 0.002616 604 build 0.002614